## Setup

# clear workspace
rm(list = ls()); gc()

# load packages
library(tidyverse)
library(dplyr)
library(readxl)
library(zoo)

# set working directory to source file location
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))


# ~~~~~~~~~~


# This script takes <5 minutes to run


# ~~~~~~~~~~


## Load survey data and prep covariates

# load survey data
setwd("../data/raw")
data <- read.csv("covid_19_economist_stacked_replication.csv")

# gender
data <- data %>%
  mutate(gender = if_else(gender == 2, 1, 0)) %>%
  rename(female = gender)

# race
data$race <- ifelse(data$race == 1, "White", 
                    ifelse(data$race == 2, "Black", 
                           ifelse(data$race == 3, "Hispanic", "Other"))) %>% as.factor() %>%
  relevel(data$race, ref = "Hispanic") %>%
  relevel(data$race, ref = "Black") %>%
  relevel(data$race, ref = "White")

# education
data$educ3 <- ifelse(data$educ == 1 | data$educ == 2, "High school or less", 
                     ifelse(data$educ == 3 | data$educ == 4, "Some or 2-year college", 
                            "Bachelor's degree or more")) %>% as.factor() %>%
  relevel(data$educ3, ref = "Some or 2-year college") %>%
  relevel(data$educ3, ref = "High school or less")

# income
data$inc3 <- ifelse(data$faminc <= 4, "Less than $40,000", 
                    ifelse(data$faminc >= 5 & data$faminc <= 8, "$40,000 - $79,999", 
                           ifelse(data$faminc >= 9 & data$faminc <= 16, "More than $80,000", NA))) %>% as.factor() %>%
  relevel(data$inc3, ref = "$40,000 - $79,999") %>%
  relevel(data$inc3, ref = "Less than $40,000")

# ideology
data$ideo5 <- ifelse(data$ideo5 == 1, "Very liberal", 
                     ifelse(data$ideo5 == 2, "Liberal", 
                            ifelse(data$ideo5 == 3, "Moderate", 
                                   ifelse(data$ideo5 == 4, "Conservative", 
                                          ifelse(data$ideo5 == 5, "Very conservative", "Not sure")))))

# load and merge total COVID deaths by day from CDC
usdeaths <- read.csv("data_table_for_total_deaths__united_states.csv") %>%
  rename(surveydate = Date, total_deaths = Total.Deaths)
data <- left_join(data, usdeaths); rm(usdeaths)


# ~~~~~~~~~~


## Map ZIP codes to counties

# load ZIP code to county crosswalk file
crosswalk <- read.csv("ZIP_COUNTY_122020.csv")

# add 0s to ZIP codes that are fewer than 5 characters long | this is necessary for merging
data$zip_code <- ifelse(nchar(data$zip_code) == 5, data$zip_code, 
                        ifelse(nchar(data$zip_code) == 4, paste("0", data$zip_code, sep = ""),
                               ifelse(nchar(data$zip_code) == 3, paste("00", data$zip_code, sep = ""),
                                      ifelse(nchar(data$zip_code) == 2, paste("000", data$zip_code, sep = ""),
                                             ifelse(nchar(data$zip_code) == 1, paste("0000", data$zip_code, sep = ""), NA)))))

crosswalk$ZIP <- ifelse(nchar(crosswalk$ZIP) == 5, crosswalk$ZIP, 
                        ifelse(nchar(crosswalk$ZIP) == 4, paste("0", crosswalk$ZIP, sep = ""), paste("00", crosswalk$ZIP, sep = "")))

# add 0s to county FIPS codes that are fewer than 5 characters long | this is necessary for merging
crosswalk$COUNTY <- ifelse(nchar(crosswalk$COUNTY) == 4, paste("0", crosswalk$COUNTY, sep = ""), crosswalk$COUNTY)

# initialize county FIPS variable
data$FIPS <- NA


# calculate proportion of respondents who entered a wrong ZIP/can be matched exactly to one ZIP for footnote 3

# count people with missing or nonexistence ZIP
f3.1 <- (sum(!(data$zip_code %in% crosswalk$ZIP) | is.na(data$zip_code))/nrow(data))*100

# what proportion of respondents with good ZIP can be matched to exactly one county?
data <- data %>%
  mutate(good_zip = case_when(is.na(zip_code) ~ 0,
                              !(zip_code %in% crosswalk$ZIP) ~ 0,
                              .default = 1))

# extract list of ZIP codes that can be assigned to exactly one county
zips <- crosswalk[table(crosswalk$ZIP) == 1,]$ZIP

# calculate proportion
f3.2 <- sum(data[data$good_zip == 1,]$zip_code %in% zips)/(nrow(data)-sum(!(data$zip_code %in% crosswalk$ZIP) | is.na(data$zip_code)))*100

# save as txt file
setwd("../../output")
writeLines(paste(paste("Percent of respondents who failed to provide a ZIP code or entered a nonexistent ZIP code:", f3.1),
                 paste("Percent of respondents who can be mapped onto exactly one county:", f3.2),
                 sep = "\n"), "footnote_3.txt"); rm(f3.1, f3.2)


# assign ZIP codes probabilistically based on proportion of ZIP code addresses in each county

# loop over respondents
for(i in 1:nrow(data)){
  
  # if zip_code corresponds to only one county
  if(sum(crosswalk$ZIP == data[i,]$zip_code, na.rm = T) == 1){
    
    # set data$FIPS to its value from crosswalk
    data[i,]$FIPS <- crosswalk[crosswalk$ZIP == data[i,]$zip_code,]$COUNTY
  
  # if zip_code corresponds to multiple counties
  }else if(sum(crosswalk$ZIP == data[i,]$zip_code, na.rm = T) > 1){
    
    # assign data$FIPS randomly based on proportion of RES_RATIO
    set.seed(12345)
    data[i,]$FIPS <- sample(x = crosswalk[crosswalk$ZIP == data[i,]$zip_code,]$COUNTY, 
                              size = 1, 
                              prob = crosswalk[crosswalk$ZIP == data[i,]$zip_code,]$RES_RATIO)
    
  }
}; rm(i, crosswalk, zips)

data$FIPS <- as.numeric(data$FIPS)

# drop participants with no ZIP
data <- data %>% filter(!is.na(FIPS))


# ~~~~~~~~~~


## Fix PID variable and record and save proportion of independents and 'don't knows' (footnote 12 pg. 14)

# proportion of independents
writeLines(paste(paste("Proportion of Independents:", 100*round(sum(data$pid7 == 4)/nrow(data), 2)),
                 paste("Proportion of don't knows:", 100*round(sum(data$pid7 == 8)/nrow(data), 2)), sep = "\n"), "footnote_12.txt")

# partisan ID
data$pid7 <- ifelse(data$pid7 == 1, "Strong Democrat", 
                    ifelse(data$pid7 == 2, "Not very strong Democrat", 
                           ifelse(data$pid7 == 3, "Lean Democrat", 
                                  ifelse(data$pid7 == 4 | data$pid7 == 8, "Independent", 
                                         ifelse(data$pid7 == 5, "Lean Republican", 
                                                ifelse(data$pid7 == 6, "Not very strong Republican", 
                                                       ifelse(data$pid7 == 7, "Strong Republican", NA)))))))

# create trichotomous party variable
data$party <- ifelse(data$pid7 %in% c("Strong Republican", "Not very strong Republican", "Lean Republican"), "Republican", 
                     ifelse(data$pid7 == "Independent", "Independent", "Democrat"))


# ~~~~~~~~~~


## Merge in COVID-19 cases and deaths

# load Johns Hopkins COVID data
setwd("../data/raw")
cases <- read.csv("time_series_covid19_confirmed_US.csv") %>%
  dplyr::select(-c(UID, iso2, iso3, code3, Country_Region, Lat, Long_, Combined_Key)) %>%
  rename(county = Admin2, state = Province_State)
deaths <- read.csv("time_series_covid19_deaths_US.csv") %>% 
  dplyr::select(-c(UID, iso2, iso3, code3, Country_Region, Lat, Long_, Combined_Key)) %>%
  rename(population = Population, county = Admin2, state = Province_State)

# pivot datasets to long so date is in one column variable
cases <- pivot_longer(cases, cols = starts_with("X"), names_to = "surveydate", values_to = "cases")
deaths <- pivot_longer(deaths, cols = starts_with("X"), names_to = "surveydate", values_to = "deaths")

# remove observations with missing FIPS
cases <- cases %>% filter(!is.na(FIPS))
deaths <- deaths %>% filter(!is.na(FIPS))

# convert date variables to Date to merge
data$surveydate <- as.Date(data$surveydate, tryFormats = "%m/%d/%Y")
cases$surveydate <- gsub("X", "", cases$surveydate)
deaths$surveydate <- gsub("X", "", deaths$surveydate)
cases$surveydate <- as.Date(cases$surveydate, tryFormats = "%m.%d.%Y")
deaths$surveydate <- as.Date(deaths$surveydate, tryFormats = "%m.%d.%Y")

# create measure of cumulative cases and deaths in the 30 days leading up to survey administration
cases <- cases %>% group_by(FIPS) %>% mutate(cases_30 = cases - lag(cases, 30, default = 0))
deaths <- deaths %>% group_by(FIPS) %>% mutate(deaths_30 = deaths - lag(deaths, 30, default = 0))

# merge into survey data
data <- left_join(data, cases, relationship = "many-to-one"); rm(cases)
data <- left_join(data, deaths %>% dplyr::select(-c(county, state)), relationship = "many-to-one"); rm(deaths)


# ~~~~~~~~~~


## Merge in county-level covariates

# (1) Hospital beds

# load and prep data
beds <- read.csv("American_Hospital_Association_2019.csv") %>% 
  dplyr::select(-c(ID, MLOCSTCD, FCNTYCD, CNTYNAME, YEAR)) %>%
  rename(beds = HOSPBD, FIPS = FCOUNTY) %>%
  group_by(FIPS) %>%
  summarize(beds = sum(beds))

# merge into survey data
data <- left_join(data, beds); rm(beds)

# replace any NAs with 0s
data$beds <- ifelse(is.na(data$beds), 0, data$beds)


# (2) Educational attainment

# load and prep data
education <- read.csv("Education.csv") %>%
  dplyr::select(c(FIPS.Code, Percent.of.adults.with.a.bachelor.s.degree.or.higher..2015.19)) %>%
  rename(FIPS = FIPS.Code, bachelors = Percent.of.adults.with.a.bachelor.s.degree.or.higher..2015.19)

# merge into survey data
data <- left_join(data, education); rm(education)


# (3) Black and Hispanic population

# load and prep data
race <- read.csv("county_race.csv") %>%
  dplyr::select(-NAME) %>%
  mutate(GEO_ID = substr(GEO_ID, nchar(GEO_ID)-4, nchar(GEO_ID)), # fix FIPS code
         GEO_ID = as.numeric(GEO_ID)) %>%
  rename(FIPS = GEO_ID)

# merge into survey data
data <- left_join(data, race); rm(race)

# convert population totals to proportions
data <- data %>% 
  mutate(BLACK = BLACK/population,
         HISPANIC = HISPANIC/population) %>% 
  rename(black = BLACK, hispanic = HISPANIC)


# (4) Median household income

# load and prep data
income <- read.csv("est19all.csv") %>%
  dplyr::select(c(FIPS, Median.Household.Income)) %>%
  rename(median_household_income = Median.Household.Income) %>%
  mutate(median_household_income = gsub("[[:punct:] ]+", "", median_household_income), # remove commas
         median_household_income = as.numeric(median_household_income)) # one tiny HI county has value '.' for privacy

# merge into survey data
data <- left_join(data, income); rm(income)


# (5) Republican 2016 presidential vote share

# load and prep data
votes <- read.csv("countypres_2016.csv") %>% 
  filter(year == 2016 & candidate != "Other" & !is.na(FIPS)) %>% # subset to only include 2016 and to include only two-party vote share and to remove missing FIPS (e.g. for statewide write-ins)
  dplyr::select(-c(version, year, office, candidate))

votes <- pivot_wider(votes, id_cols = FIPS, names_from = party, values_from = candidatevotes)
votes$rep_share <- votes$republican/(votes$democrat + votes$republican)

# merge into survey data
data <- left_join(data, votes %>% dplyr::select(c(FIPS, rep_share))); rm(votes)


# (6) Unemployment and (7) Rural-urban continuum

# load and prep data
econ <- read_excel("Unemployment.xlsx", sheet = "UnemploymentMedianIncome") %>%
  dplyr::select(c(FIPS_Code, Rural_Urban_Continuum_Code_2013, Unemployment_rate_2019)) %>%
  rename(FIPS = FIPS_Code, rural_urban_continuum = Rural_Urban_Continuum_Code_2013, unemployment = Unemployment_rate_2019) %>%
  mutate(FIPS = as.numeric(FIPS))

# merge into survey data
data <- left_join(data, econ); rm(econ)


# ~~~~~~~~~~


## Construct misperception variables

# extract unique survey dates
weeks <- unique(data$surveydate)

# regardless of survey date, anyone who responded (6) to covid_deaths is marked as overestimating
data$over <- ifelse(data$covid_deaths == 6, 1, 0)

# initialize underestimation variables 
data$under <- NA

# loop over weeks
for(w in weeks){
  
  # extract number of deaths by that survey date
  deaths <- unique(data[data$surveydate == w,]$total_deaths)
    
  # if deaths <= 100k, no one under
  if(deaths <= 100000){
  
    data[data$surveydate == w,]$under <- 0
      
  # if deaths > 100k & <= 150k, under = anyone who says (1)  
  }else if(deaths > 100000 & deaths <= 150000){
      
    data[data$surveydate == w,]$under <- ifelse(data[data$surveydate == w,]$covid_deaths == 1, 1, 0)
      
  # if deaths > 150k & <= 200k, under = anyone who says (1) or (2)  
  }else if(deaths > 150000 & deaths <= 200000){
      
    data[data$surveydate == w,]$under <- ifelse(data[data$surveydate == w,]$covid_deaths %in% c(1, 2), 1, 0)
      
  # if deaths > 200k & <= 250k, under = anyone who says (1), (2), or (3)  
  }else if(deaths > 200000 & deaths <= 250000){
      
    data[data$surveydate == w,]$under <- ifelse(data[data$surveydate == w,]$covid_deaths %in% c(1, 2, 3), 1, 0)
      
  # if deaths > 250k, under = anyone who says (1), (2), (3), or (4)  
  }else{
      
    data[data$surveydate == w,]$under <- ifelse(data[data$surveydate == w,]$covid_deaths %in% c(1, 2, 3, 4), 1, 0)
      
  }
}; rm(w, deaths, weeks)

# fix NAs
data$under <- ifelse(is.na(data$covid_deaths), NA, data$under)


## Generate Figure 1: Misperceptions over time by party

# get under-estimations data in right format
plot_under <- data %>%
  filter(party %in% c("Democrat", "Republican"), !is.na(weight)) %>%
  group_by(party, surveydate) %>%
  summarize(prediction = 100*weighted.mean(under, weight, na.rm = T)) %>%
  rename(Party = party) %>%
  mutate(DV = "under",
         period = case_when(surveydate %in% c("2020-05-23", "2020-05-30", "2020-06-06", "2020-06-13") ~ 1,
                            surveydate %in% c("2020-08-15", "2020-08-22", "2020-08-29", "2020-09-05", "2020-09-12") ~ 2,
                            surveydate %in% c("2020-09-19", "2020-09-26", "2020-10-03", "2020-10-10", "2020-10-17", "2020-10-24", "2020-10-31", "2020-11-07") ~ 3,
                            surveydate %in% c("2020-11-14", "2020-11-21") ~ 4),
         surveydate = as.Date(surveydate, tryFormats = "%Y/%m/%d"))

# get over-estimations data in right format
plot_over <- data %>%
  filter(party %in% c("Democrat", "Republican"), !is.na(weight)) %>% 
  group_by(party, surveydate) %>% 
  summarize(prediction = 100*weighted.mean(over, weight, na.rm = T)) %>%
  rename(Party = party) %>%
  mutate(DV = "over",
         period = case_when(surveydate %in% c("2020-05-23", "2020-05-30", "2020-06-06", "2020-06-13") ~ 5,
                            surveydate %in% c("2020-08-15", "2020-08-22", "2020-08-29", "2020-09-05", "2020-09-12") ~ 6,
                            surveydate %in% c("2020-09-19", "2020-09-26", "2020-10-03", "2020-10-10", "2020-10-17", "2020-10-24", "2020-10-31", "2020-11-07") ~ 7,
                            surveydate %in% c("2020-11-14", "2020-11-21") ~ 8),
         surveydate = as.Date(surveydate, tryFormats = "%Y/%m/%d"))

# merge
toplot <- bind_rows(plot_under, plot_over); rm(plot_under, plot_over)

# save dates and labels
dates <- c("2020-05-23", "2020-05-30", "2020-06-06", "2020-06-13", "2020-06-20", "2020-06-27", "2020-07-04", "2020-07-11", 
           "2020-07-18", "2020-07-25", "2020-08-01", "2020-08-08", "2020-08-15", "2020-08-22", "2020-08-29", "2020-09-05", 
           "2020-09-12", "2020-09-19", "2020-09-26", "2020-10-03", "2020-10-10", "2020-10-17", "2020-10-24", "2020-10-31", 
           "2020-11-07", "2020-11-14", "2020-11-21")

labels <- c("May 23", "May 30", "Jun 6", "Jun 13", "", "", "", "", "", "", "",  "", "Aug 15", "Aug 22", "Aug 29", "Sep 5", 
            "Sep 12", "Sep 19", "Sep 26", "Oct 3", "Oct 10", "Oct 17", "Oct 24", "Oct 31", "Nov 7", "Nov 14", "Nov 21")

# plot
p <- toplot %>%
  ggplot(aes(x = surveydate, y = prediction, fill = as.factor(period))) + 
  geom_point(aes(color = Party, shape = DV), size = 4.4) + 
  geom_smooth(aes(x = surveydate, y = prediction, color = Party, linetype = DV), method = "lm", se = F) + 
  scale_color_manual(values = c("blue", "red")) + 
  theme_bw() + 
  labs(x = "", y = "Percent misperceiving") + 
  guides(fill = "none", linetype = "none") + 
  theme(legend.position = "bottom",
        text = element_text(size=22),
        axis.text.x = element_text(angle = 90)) + 
  scale_y_continuous(labels=function(x) paste0(x,"%")) + 
  scale_x_date(breaks = as.Date(dates),
               labels = labels)

# save
ggsave(filename = "Figure 1.png", plot = p, device = "png", path = "../../output", width = 15, height = 10, units = "in", bg = "white")


## Generate Figure A.1: Violin plot of death estimations

# group data into months
data$month <- strftime(data$surveydate, format = "%B") %>% as.factor() %>%
  relevel(data$month, ref = "November") %>%
  relevel(data$month, ref = "October") %>%
  relevel(data$month, ref = "September") %>%
  relevel(data$month, ref = "August") %>%
  relevel(data$month, ref = "June") %>%
  relevel(data$month, ref = "May")

# create variable for adding total deaths graph | these factors map numeric values onto the ordinal Y axis
data$month_deaths <- ifelse(data$month == "May", 1.22, 
                            ifelse(data$month == "June", 1.58,
                                   ifelse(data$month == "August", 2.8,
                                          ifelse(data$month == "September", 3.18, 
                                                 ifelse(data$month == "October", 3.68, 4.34)))))

data$month_deaths_labels <- ifelse(data$month == "May", "111k", 
                                   ifelse(data$month == "June", "129k",
                                          ifelse(data$month == "August", "190k",
                                                 ifelse(data$month == "September", "209k", 
                                                        ifelse(data$month == "October", "234k", "287k")))))

# plot
p <- data %>%
  filter(party %in% c("Republican", "Democrat"), !is.na(covid_deaths)) %>%
  ggplot(aes(x = month, y = covid_deaths, fill = party)) + 
  geom_violin(position = position_dodge(width = 0.6)) + 
  scale_fill_manual(values = c("blue", "red")) +
  scale_y_continuous(breaks = 1:6, labels = c("<100k", "100-150k", "150-200k", "200-250k", "250k-1m", ">1m")) + 
  ylab("Respondent COVID-19 fatality estimate") +
  theme_bw() + 
  theme(axis.title.x = element_blank(),
        legend.position="bottom",
        legend.title = element_blank(), 
        text = element_text(size=22)) + 
  geom_text(aes(x = month, y = month_deaths, label = month_deaths_labels), size = 6, show.legend = F)

# save
ggsave(filename = "Figure A.1.png", plot = p, device = "png", path = "../../output", width = 15, height = 10, units = "in", bg = "white")

# drop unnecessary variables from data
data <- data %>% dplyr::select(-c(month, month_deaths, month_deaths_labels)); rm(p, toplot, labels, dates)


# ~~~~~~~~~~


## Merge in topic model

# load topic model
setwd("../processed")
topics <- readRDS("topic_model.rds")

# add week variable to main data for merging
data$week <- as.numeric(strftime(data$surveydate, format = "%V"))

# pivot topic model wider for merging, create one measure for each source, and average USA Today and ABC
topics <- topics %>% pivot_wider(names_from = source, values_from = estimate) %>%
  rename(ABC_covid_estimate = ABC, FOX_covid_estimate = Fox, MSNBC_covid_estimate = MSNBC, USA_covid_estimate = `USA Today`) %>%
  mutate(covid_nonpartisan_estimate = (ABC_covid_estimate+USA_covid_estimate)/2)

# lag topics by 1 week to merge based on coverage in the week before respondents' survey administration
topics$week <- topics$week-1

# make sure topics is arranged by week
topics <- topics %>% arrange(week)

# create ~two-week~ and ~month~ versions of nonpartisan press estimate
topics <- topics %>% 
  mutate(covid_2weeks_estimate = rollapply((ABC_covid_estimate + USA_covid_estimate)/2,
                                           width = 2,
                                           FUN = sum,
                                           align = "right", 
                                           fill = (ABC_covid_estimate + USA_covid_estimate)/2))
topics <- topics %>% 
  mutate(covid_1month_estimate = rollapply((ABC_covid_estimate + USA_covid_estimate)/2,
                                           width = 4,
                                           FUN = sum,
                                           align = "right", 
                                           fill = (ABC_covid_estimate + USA_covid_estimate)/2))

# generate Figure A.10: Aggregate COVID-19 prevalence over time in last week, two weeks, and month
p <- topics %>% 
  dplyr::select(-c(ABC_covid_estimate, FOX_covid_estimate, MSNBC_covid_estimate, USA_covid_estimate)) %>%
  pivot_longer(cols = c(covid_nonpartisan_estimate, covid_2weeks_estimate, covid_1month_estimate), names_to = "Duration", values_to = "prop") %>%
  mutate(Duration = case_when(Duration == "covid_nonpartisan_estimate" ~ "1 week",
                              Duration == "covid_2weeks_estimate" ~ "2 weeks",
                              Duration == "covid_1month_estimate" ~ "1 month")) %>%
  ggplot(aes(x = week, y = prop, color = Duration)) + 
  geom_line() + 
  theme_bw() + 
  scale_x_continuous(breaks = c(1, 31/7, 59/7, 90/7, 120/7, 151/7, 181/7, 212/7, 243/7, 273/7, 304/7, 334/7), 
                     labels = c("Jan", "Feb", "March", "April", "May", "June", "July", "Aug", "Sept", "Oct", "Nov", "Dec")) + 
  theme(legend.position = "bottom",
        text = element_text(size=22)) + 
  labs(x = "", y = "Expected Topic Proportion")

# save
ggsave(filename = "Figure A.10.png", plot = p, device = "png", path = "../../output", width = 15, height = 10, units = "in", bg = "white"); rm(p)

# generate numbers for Appendix A.12 and save as txt file
setwd("../../output")
writeLines(paste(paste("Proportion of range in media attention (1 week):", round((max(topics[topics$week %in% data$week,]$covid_nonpartisan_estimate)-min(topics[topics$week %in% data$week,]$covid_nonpartisan_estimate))/(max(topics$covid_nonpartisan_estimate)-min(topics$covid_nonpartisan_estimate)),3)), 
                 paste("Proportion of range in media attention (1 month):", round((max(topics[topics$week %in% data$week,]$covid_1month_estimate)-min(topics[topics$week %in% data$week,]$covid_1month_estimate))/(max(topics$covid_1month_estimate)-min(topics$covid_1month_estimate)),3)),
                 sep = "\n"), "Appendix A.12.txt")

# merge into main data and save output
data <- left_join(data, topics); rm(topics)
setwd("../data/processed")
saveRDS(data, "master_data.rds")

